Proposal

Part 1: Data Collection

Using the YouTube API, I’m able to collect roughly 500 search results for a given search term. I create search terms that target different levels of topic difficulty (beginner, intermediate, advanced).
# YouTube v3 API keys from Google Cloud Platform go here
clientId <- ""
clientSecret <- ""
library(tuber)

# Delete .httr-oauth to reset the cached token.
# Only attempt OAuth when both credentials are supplied. Use the scalar,
# short-circuiting && in an if() condition rather than the elementwise &
# (which errors on length > 1 inputs in R >= 4.3).
if (clientId != "" && clientSecret != "") {
  yt_oauth(
    app_id = clientId,
    app_secret = clientSecret
  )
}
library(readr)
library(tuber)
library(dplyr)   # provides %>% used below (not previously loaded in this chunk)
library(tibble)  # provides add_column() used below (not previously loaded)

directory <- "data/"

terms <- c(
  "R Programming", 
  "Beginner R Programming",
  "Intermediate R Programming",
  "Advanced R Programming"
)

# Grab general info from the search endpoint for each term. Results are
# cached to one CSV per term (e.g. "data/R Programming.csv") so reruns do
# not repeat API calls.
for (term in terms) {
  filePath <- paste0(directory, term, ".csv")

  if (!file.exists(filePath)) {
    videos <- yt_search(term)
    # Keep video_id, publishedAt, channelId, title, description,
    # thumbnails.default.url, and channelTitle (columns 1-6 and 15); drop the
    # remaining thumbnail-size columns plus liveBroadcastContent/publishTime.
    videos <- videos[, c(1, 2, 3, 4, 5, 6, 15)]

    # Placeholder columns filled in later by the per-video detail pass.
    videos <- videos %>%
      add_column(viewCount = NA,
                 likeCount = NA,
                 dislikeCount = NA,
                 favoriteCount = NA,
                 commentCount = NA,
                 tags = NA)

    write_csv(videos, filePath)
  }
}
After receiving general information on each video under a term, I’m able to use its ID to request further details such as the full description, metadata tags, and performance metrics (views, likes, dislikes, comments). Because of API limitations, detail calls are limited to the top 100 videos (sorted by relevancy) returned for each term.
library(readr)
library(tuber)
library(dplyr)
library(lubridate)
library(tibble)  # add_column()

directory <- "data/"

files <- c(
  "R Programming.csv", 
  "Beginner R Programming.csv",
  "Intermediate R Programming.csv",
  "Advanced R Programming.csv"
)

# Get details for the top 100 results per search (sorted by relevancy).
# The presence of a viewCount column marks a file as already enriched, so
# reruns skip it and do not burn API quota.
maxRelevant <- 100

for (file in files) {
  filePath <- paste0(directory, file)

  videos <- read_csv(filePath)

  if (!any(names(videos) == "viewCount")) {
    # Placeholder columns 8-13: viewCount, likeCount, dislikeCount,
    # favoriteCount, commentCount, tags.
    videos <- videos %>%
      add_column(viewCount = NA,
                 likeCount = NA,
                 dislikeCount = NA,
                 favoriteCount = NA,
                 commentCount = NA,
                 tags = NA)

    # seq_len(min(...)) never indexes past the end of a short result set.
    for (y in seq_len(min(maxRelevant, nrow(videos)))) {
      videoId <- videos[[y, 1]]

      # Performance metrics; a metric absent from the API response is
      # recorded as 0. ([["name"]] returns NULL for a missing element, so a
      # separate %in% names() check is redundant.)
      stats <- get_stats(videoId)
      videos[y, 8]  <- if (!is.null(stats[["viewCount"]])) as.double(stats$viewCount) else 0
      videos[y, 9]  <- if (!is.null(stats[["likeCount"]])) as.double(stats$likeCount) else 0
      videos[y, 10] <- if (!is.null(stats[["dislikeCount"]])) as.double(stats$dislikeCount) else 0
      videos[y, 11] <- if (!is.null(stats[["favoriteCount"]])) as.double(stats$favoriteCount) else 0
      videos[y, 12] <- if (!is.null(stats[["commentCount"]])) as.double(stats$commentCount) else 0

      # Full description and metadata tags come from the details endpoint.
      details <- get_video_details(videoId)
      snippet <- details$items[[1]]$snippet

      # BUG FIX: the original `for (z in 1:numTags)` crashed on videos with
      # no tags (1:0 still iterates, then tags[[1]] fails on an empty list);
      # paste(collapse = ",") handles the empty case and yields "" exactly as
      # the original intended.
      videos[y, 13] <- paste(unlist(snippet[["tags"]]), collapse = ",")

      # Round publish timestamps to the day for consistent date comparisons.
      # NOTE(review): round_date() receives a one-cell tibble here, mirroring
      # the original code — confirm it returns the rounded date as expected.
      videos[y, 2] <- round_date(videos[y, 2], "day")
      videos[y, 5] <- if (!is.null(snippet[["description"]])) snippet$description else ""

      # Possible future enrichment:
      #captions <- get_captions(videoId)
      #comments <- get_comment_threads(c(video_id = videoId))
    }

    # BUG FIX: write each enriched set back inside the loop. The original
    # script called write_csv() once after the loop, so only the LAST file's
    # details were ever saved and the other three enrichments were lost.
    write_csv(videos, filePath)
  }
}

videos
## # A tibble: 601 × 13
##    video_id    publishedAt         channelId  title description thumbnails.defa…
##    <chr>       <dttm>              <chr>      <chr> <chr>       <chr>           
##  1 aHiwr3-xJLM 2020-09-17 00:00:00 UCKacOPkh… R ad… "Download … https://i.ytimg…
##  2 H3ao7LzcvW8 2020-12-23 00:00:00 UCTZXht1R… Adva… "In this v… https://i.ytimg…
##  3 VVUn_R-ExlQ 2016-08-26 00:00:00 UCGwuewhd… UC B… "Module 10… https://i.ytimg…
##  4 BKbG0LFNjic 2019-08-29 00:00:00 UCmG1AOBM… Adva… "CRC Press… https://i.ytimg…
##  5 WkF7nqEYF1E 2021-08-27 00:00:00 UC3xfbCML… R Ma… "R Markdow… https://i.ytimg…
##  6 JLvigQQZ5VE 2018-02-05 00:00:00 UCifB-UuE… Adva…  <NA>       https://i.ytimg…
##  7 KlsYCECWEWE 2020-08-29 00:00:00 UCsvqVGtb… R Pr… "🔥 Enroll… https://i.ytimg…
##  8 bzUmK0Y07ck 2019-06-29 00:00:00 UCPyA0XmU… &quo… "Title: Th… https://i.ytimg…
##  9 UwzGhMndWzs 2020-05-09 00:00:00 UCCaChdLM… Adva… "Asmae Tou… https://i.ytimg…
## 10 iROHLA_TXQM 2020-12-18 00:00:00 UCsvqVGtb… R Fo… "In this v… https://i.ytimg…
## # … with 591 more rows, and 7 more variables: channelTitle <chr>,
## #   viewCount <dbl>, likeCount <dbl>, dislikeCount <dbl>, favoriteCount <dbl>,
## #   commentCount <dbl>, tags <chr>

Part 2: Data Cleaning

For smoother handling, I then compile the individual data sets into a single superset, making sure to tag each record with its parent search term for future filtering.
For an extra dimension, using a secondary data source, I perform date manipulation and comparison to find what the latest R release was at the time of a video’s publishing. Perhaps one version is far more popular, or served as a much larger entry point for learners when compared to others.
library(readr)
library(dplyr)
library(lubridate)
library(stringr)
library(tibble)

directory <- "data/"

# Per-term CSVs produced in Part 1. paste0() replaces the paste(sep = "")
# idiom throughout.
rFilePath <- paste0(directory, "R Programming.csv")
beginnerFilePath <- paste0(directory, "Beginner R Programming.csv")
intermediateFilePath <- paste0(directory, "Intermediate R Programming.csv")
advancedFilePath <- paste0(directory, "Advanced R Programming.csv")

rVideos <- read_csv(rFilePath)
beginnerVideos <- read_csv(beginnerFilePath)
intermediateVideos <- read_csv(intermediateFilePath)
advancedVideos <- read_csv(advancedFilePath)



# Only the top results of each search were enriched with details in Part 1.
maxRelevant <- 100

# Dedupe records from the generic search and the other targeted searches for
# good measure: fold dplyr::setdiff() over the other three result sets so
# each level keeps only the rows unique to it.
beginnerVideosUnique <- Reduce(
  setdiff,
  list(rVideos, intermediateVideos, advancedVideos),
  beginnerVideos[1:maxRelevant, ]
)

intermediateVideosUnique <- Reduce(
  setdiff,
  list(rVideos, beginnerVideos, advancedVideos),
  intermediateVideos[1:maxRelevant, ]
)

advancedVideosUnique <- Reduce(
  setdiff,
  list(rVideos, beginnerVideos, intermediateVideos),
  advancedVideos[1:maxRelevant, ]
)



# Tag each deduped set with its difficulty level and append placeholder
# columns for the R-version lookup and the relevancy flag filled in below
# (mutate() appends new columns at the end, same as add_column()).
beginnerVideosUnique <- beginnerVideosUnique %>%
  mutate(level = "beginner",
         rVersion = NA,
         relevant = NA)

intermediateVideosUnique <- intermediateVideosUnique %>%
  mutate(level = "intermediate",
         rVersion = NA,
         relevant = NA)

advancedVideosUnique <- advancedVideosUnique %>%
  mutate(level = "advanced",
         rVersion = NA,
         relevant = NA)



# Check relevancy in the deduped sets according to the appearance of the
# standalone word "R" in title, description, or tags, and round publish
# timestamps to the day. Bare column names are used inside mutate() instead
# of df$col self-references — the original pattern silently breaks if the
# pipeline is later grouped or reordered. The shared pattern is hoisted out
# of the three calls. Note: NA tags/description propagate NA through
# str_detect(), matching the original behaviour.
rPattern <- regex("\\bR\\b", ignore_case = TRUE)

beginnerVideosUnique <- beginnerVideosUnique %>%
  mutate(publishedAt = round_date(publishedAt, "day"),
         relevant = str_detect(title, rPattern)
                    | str_detect(tags, rPattern)
                    | str_detect(description, rPattern))

intermediateVideosUnique <- intermediateVideosUnique %>%
  mutate(publishedAt = round_date(publishedAt, "day"),
         relevant = str_detect(title, rPattern)
                    | str_detect(tags, rPattern)
                    | str_detect(description, rPattern))

advancedVideosUnique <- advancedVideosUnique %>%
  mutate(publishedAt = round_date(publishedAt, "day"),
         relevant = str_detect(title, rPattern)
                    | str_detect(tags, rPattern)
                    | str_detect(description, rPattern))



# Secondary data source: the R release history, one row per version with a
# human-readable date string that is split on the comma below.
rVersionFile <- "R Versions.csv"
rVersionsPath <- paste(directory, rVersionFile, sep = "")
rVersions <- read_delim(rVersionsPath, trim_ws = TRUE)

# Derive dateReleased only once; its presence marks an already-processed file.
if (!any(names(rVersions) == "dateReleased")) {
  rVersions <- rVersions %>%
    add_column(dateReleased = NA)
    
  # Each date is assumed to split into c("<full English month name>",
  # " <year>") — TODO confirm this matches the CSV's date format.
  dateSplits <- str_split(rVersions$date, ",")
  
  numDateSplits <- length(dateSplits)
  
  for (x in 1:numDateSplits) {
    # Rebuild as "<year>-<month number>-01": the release day is collapsed to
    # the 1st of the month. NOTE(review): the year token keeps its leading
    # space and the month number is unpadded — "%Y-%m-%d" appears to tolerate
    # both, but verify the parsed dates. Column 3 is dateReleased.
    rVersions[x,3] <- as.POSIXct(paste(
      dateSplits[[x]][2],
      match(dateSplits[[x]][1], month.name),
      "01",
      sep = "-"
    ), format = "%Y-%m-%d")
  }
  
  # Persist so the block above is skipped on the next run.
  write_csv(rVersions, paste(directory, rVersionFile, sep = ""))
}


# Check which R version was current at the time of each video's release.
# rVersions is assumed to be ordered newest-first, so the first release dated
# on or before the publish date wins; videos older than every listed release
# keep their NA placeholder (matching the NAs seen in the summaries below).
numVersions <- nrow(rVersions)

numBeginnerVideos <- nrow(beginnerVideosUnique)

# seq_len() is safe when a table is empty; the original 1:n would iterate
# over c(1, 0) and crash. The while-counter is replaced by for + break with
# the equivalent inverted condition (>= instead of "advance while <").
for (x in seq_len(numBeginnerVideos)) {
  for (v in seq_len(numVersions)) {
    # Column 2 is publishedAt, rVersions column 3 is dateReleased,
    # column 15 is the rVersion placeholder added earlier.
    if (beginnerVideosUnique[x, 2] >= rVersions[v, 3]) {
      beginnerVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}

numIntermediateVideos <- nrow(intermediateVideosUnique)

# Record the newest R release dated on or before each intermediate video's
# publish date (rVersions assumed newest-first; unmatched rows stay NA).
# seq_len() avoids the 1:0 crash on an empty table; for + break replaces the
# while-counter with the equivalent inverted condition.
for (x in seq_len(numIntermediateVideos)) {
  for (v in seq_len(numVersions)) {
    if (intermediateVideosUnique[x, 2] >= rVersions[v, 3]) {
      # Column 15 is the rVersion placeholder.
      intermediateVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}

numAdvancedVideos <- nrow(advancedVideosUnique)

# Record the newest R release dated on or before each advanced video's
# publish date (rVersions assumed newest-first; unmatched rows stay NA).
# seq_len() avoids the 1:0 crash on an empty table; for + break replaces the
# while-counter with the equivalent inverted condition.
for (x in seq_len(numAdvancedVideos)) {
  for (v in seq_len(numVersions)) {
    if (advancedVideosUnique[x, 2] >= rVersions[v, 3]) {
      # Column 15 is the rVersion placeholder.
      advancedVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}



# Combine the three level-specific sets into one superset; dplyr::union()
# also drops any duplicate rows that survived the per-level dedupe.
uniqueAll <- union(union(beginnerVideosUnique, intermediateVideosUnique), advancedVideosUnique)

# paste0() replaces the paste(sep = "") idiom.
write_csv(beginnerVideosUnique, paste0(directory, "Beginner R Programming Unique.csv"))
write_csv(intermediateVideosUnique, paste0(directory, "Intermediate R Programming Unique.csv"))
write_csv(advancedVideosUnique, paste0(directory, "Advanced R Programming Unique.csv"))
write_csv(uniqueAll, paste0(directory, "R Programming Unique.csv"))

Part 3: Data Analysis

Looking at our data sets, I compare performance by level of difficulty. I also try to extract buzzwords and prominent topics at each level by looking at the frequency of words in the title, description, and metadata. Surprisingly, intermediate-level R videos perform the best.
library(readr)
library(ggplot2)
library(runner)
library(stringr)
library(tidytext)
library(stopwords)  # unquoted for consistency with the other library() calls
library(wordcloud)
library(ggrepel)
library(psych)
library(dplyr)      # %>% and filter() used below were not loaded in this chunk

directory <- "data/"

# The combined, deduped superset produced in Part 2.
rUniqueFilePath <- paste0(directory, "R Programming Unique.csv")
rVideosUnique <- read_delim(rUniqueFilePath, trim_ws = TRUE)

# Keep only rows flagged relevant (standalone "R" in title/tags/description).
rVideosUniqueRelevant <- rVideosUnique %>% 
  filter(relevant == TRUE)

# "Misinformation"
# NOTE(review): despite the *Relevant suffix, these subsets are taken from
# rVideosUnique (NOT rVideosUniqueRelevant), so they deliberately retain
# relevant == FALSE rows — the ratios in the comments record how many of each
# level's videos were actually on-topic, and the plots below colour by the
# relevancy flag.
# 66/92 = 71%
beginnerVideosUniqueRelevant <- rVideosUnique %>%
       filter(level == "beginner")

# 71/95 = 75%
intermediateVideosUniqueRelevant <- rVideosUnique %>% 
  filter(level == "intermediate")

# 90/91 = 99%
advancedVideosUniqueRelevant <- rVideosUnique %>% 
  filter(level == "advanced")



# When did the market saturate? Scatter each level's views over time,
# colouring points by the relevancy flag.
ggplot(beginnerVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Beginner Relevancy by Date")

ggplot(intermediateVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Intermediate Relevancy by Date")

ggplot(advancedVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  geom_point(aes(colour = factor(relevant))) +
  labs(title = "Advanced Relevancy by Date")

# Assuming most of a video's traffic occurs within the first couple of
# months, viewership has increased at a steady rate. The number of older
# videos also speaks to the scarcity of newer ones: sorted by relevancy, one
# might expect newer videos to rank higher.
ggplot(
  arrange(rVideosUniqueRelevant, publishedAt),
  aes(publishedAt, sum_run(x = viewCount, idx = publishedAt))
) +
  geom_smooth(method = lm) +
  geom_point() +
  labs(title = "Cumulative Views by Date")

ggplot(rVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by Date")

# Zoom in on the long tail (< 1000 views).
ggplot(
  filter(rVideosUniqueRelevant, viewCount < 1000),
  aes(publishedAt, viewCount)
) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by Date")

ggplot(rVideosUniqueRelevant, aes(rVersion, viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by R version") +
  theme(axis.text.x = element_text(angle = 45))

# Distribution of view counts across all relevant videos; the skew (8.38) and
# kurtosis show a heavy right tail dominated by a few runaway hits.
rVideosUniqueRelevant$viewCount %>%
  describe
##    vars   n    mean       sd median trimmed    mad min     max   range skew
## X1    1 227 36420.6 154786.2    196 5635.71 268.35   0 1814387 1814387 8.38
##    kurtosis       se
## X1    83.93 10273.52
# Column-by-column summary of the combined relevant superset.
rVideosUniqueRelevant %>%
  summary
##    video_id          publishedAt                   channelId        
##  Length:227         Min.   :2010-02-07 00:00:00   Length:227        
##  Class :character   1st Qu.:2019-06-01 00:00:00   Class :character  
##  Mode  :character   Median :2020-09-17 00:00:00   Mode  :character  
##                     Mean   :2019-08-22 20:05:17                     
##                     3rd Qu.:2021-01-22 00:00:00                     
##                     Max.   :2021-10-30 00:00:00                     
##     title           description        thumbnails.default.url
##  Length:227         Length:227         Length:227            
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##  channelTitle         viewCount         likeCount        dislikeCount   
##  Length:227         Min.   :      0   Min.   :    0.0   Min.   :  0.00  
##  Class :character   1st Qu.:     37   1st Qu.:    1.0   1st Qu.:  0.00  
##  Mode  :character   Median :    196   Median :    4.0   Median :  0.00  
##                     Mean   :  36421   Mean   :  649.5   Mean   : 10.65  
##                     3rd Qu.:   4324   3rd Qu.:   64.5   3rd Qu.:  1.50  
##                     Max.   :1814387   Max.   :41152.0   Max.   :513.00  
##  favoriteCount  commentCount        tags              level          
##  Min.   :0     Min.   :  0.00   Length:227         Length:227        
##  1st Qu.:0     1st Qu.:  0.00   Class :character   Class :character  
##  Median :0     Median :  0.00   Mode  :character   Mode  :character  
##  Mean   :0     Mean   : 34.89                                        
##  3rd Qu.:0     3rd Qu.:  4.50                                        
##  Max.   :0     Max.   :886.00                                        
##    rVersion         relevant      
##  Length:227         Mode:logical  
##  Class :character   TRUE:227      
##  Mode  :character                 
##                                   
##                                   
## 
# Beginner subset: by far the highest mean views, again heavily right-skewed.
beginnerVideosUniqueRelevant$viewCount %>%
  describe
##    vars  n     mean      sd median  trimmed     mad min      max    range skew
## X1    1 92 465864.9 1874922 2635.5 101214.3 3905.17   0 13934381 13934381 6.09
##    kurtosis       se
## X1    37.76 195474.1
# Note the FALSE:25 relevant rows — this subset keeps off-topic videos (see
# the "Misinformation" ratios above).
beginnerVideosUniqueRelevant %>%
  summary
##    video_id          publishedAt                   channelId        
##  Length:92          Min.   :2013-10-31 00:00:00   Length:92         
##  Class :character   1st Qu.:2019-02-21 12:00:00   Class :character  
##  Mode  :character   Median :2020-04-16 12:00:00   Mode  :character  
##                     Mean   :2019-09-13 10:26:05                     
##                     3rd Qu.:2020-09-22 06:00:00                     
##                     Max.   :2021-10-30 00:00:00                     
##     title           description        thumbnails.default.url
##  Length:92          Length:92          Length:92             
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##  channelTitle         viewCount          likeCount         dislikeCount    
##  Length:92          Min.   :       0   Min.   :     0.0   Min.   :   0.00  
##  Class :character   1st Qu.:      88   1st Qu.:     1.0   1st Qu.:   0.00  
##  Mode  :character   Median :    2636   Median :    45.5   Median :   1.00  
##                     Mean   :  465865   Mean   : 10781.2   Mean   : 186.73  
##                     3rd Qu.:  163410   3rd Qu.:  3679.8   3rd Qu.:  62.25  
##                     Max.   :13934381   Max.   :336767.0   Max.   :4412.00  
##  favoriteCount  commentCount         tags              level          
##  Min.   :0     Min.   :    0.0   Length:92          Length:92         
##  1st Qu.:0     1st Qu.:    0.0   Class :character   Class :character  
##  Median :0     Median :    3.5   Mode  :character   Mode  :character  
##  Mean   :0     Mean   :  457.0                                        
##  3rd Qu.:0     3rd Qu.:  222.2                                        
##  Max.   :0     Max.   :15001.0                                        
##    rVersion          relevant      
##  Length:92          Mode :logical  
##  Class :character   FALSE:25       
##  Mode  :character   TRUE :66       
##                     NA's :1        
##                                    
## 
# Intermediate subset: second-highest mean views.
intermediateVideosUniqueRelevant$viewCount %>%
  describe
##    vars  n     mean     sd median  trimmed   mad min     max   range skew
## X1    1 95 220691.8 739024    619 35749.96 911.8   0 4617704 4617704 4.41
##    kurtosis       se
## X1    20.27 75822.26
# Like the beginner subset, relevant == FALSE rows are retained here.
intermediateVideosUniqueRelevant %>%
  summary
##    video_id          publishedAt                   channelId        
##  Length:95          Min.   :2010-02-07 00:00:00   Length:95         
##  Class :character   1st Qu.:2017-11-13 12:00:00   Class :character  
##  Mode  :character   Median :2020-05-05 00:00:00   Mode  :character  
##                     Mean   :2018-09-25 20:27:47                     
##                     3rd Qu.:2021-01-09 12:00:00                     
##                     Max.   :2021-12-02 00:00:00                     
##     title           description        thumbnails.default.url
##  Length:95          Length:95          Length:95             
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##  channelTitle         viewCount         likeCount      dislikeCount    
##  Length:95          Min.   :      0   Min.   :    0   Min.   :   0.00  
##  Class :character   1st Qu.:     62   1st Qu.:    1   1st Qu.:   0.00  
##  Mode  :character   Median :    619   Median :    6   Median :   0.00  
##                     Mean   : 220692   Mean   : 3954   Mean   :  71.79  
##                     3rd Qu.:  75637   3rd Qu.:  567   3rd Qu.:   6.50  
##                     Max.   :4617704   Max.   :87535   Max.   :1679.00  
##  favoriteCount  commentCount        tags              level          
##  Min.   :0     Min.   :   0.0   Length:95          Length:95         
##  1st Qu.:0     1st Qu.:   0.0   Class :character   Class :character  
##  Median :0     Median :   0.0   Mode  :character   Mode  :character  
##  Mean   :0     Mean   : 108.4                                        
##  3rd Qu.:0     3rd Qu.:  52.5                                        
##  Max.   :0     Max.   :2227.0                                        
##    rVersion          relevant      
##  Length:95          Mode :logical  
##  Class :character   FALSE:19       
##  Mode  :character   TRUE :71       
##                     NA's :5        
##                                    
## 
# Advanced subset: lowest views overall but the most on-topic (90/91 relevant).
advancedVideosUniqueRelevant$viewCount %>%
  describe
##    vars  n    mean       sd median trimmed    mad min   max range skew kurtosis
## X1    1 91 4823.18 13816.58     89 1321.68 118.61   0 86035 86035  4.1    17.89
##         se
## X1 1448.37
advancedVideosUniqueRelevant %>%
  summary
##    video_id          publishedAt                   channelId        
##  Length:91          Min.   :2015-11-25 00:00:00   Length:91         
##  Class :character   1st Qu.:2020-07-12 00:00:00   Class :character  
##  Mode  :character   Median :2020-11-05 00:00:00   Mode  :character  
##                     Mean   :2020-06-26 00:15:49                     
##                     3rd Qu.:2021-03-07 00:00:00                     
##                     Max.   :2021-10-28 00:00:00                     
##     title           description        thumbnails.default.url
##  Length:91          Length:91          Length:91             
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##  channelTitle         viewCount       likeCount        dislikeCount   
##  Length:91          Min.   :    0   Min.   :   0.00   Min.   : 0.000  
##  Class :character   1st Qu.:   31   1st Qu.:   0.00   1st Qu.: 0.000  
##  Mode  :character   Median :   89   Median :   2.00   Median : 0.000  
##                     Mean   : 4823   Mean   :  92.35   Mean   : 1.604  
##                     3rd Qu.: 1263   3rd Qu.:  17.50   3rd Qu.: 0.000  
##                     Max.   :86035   Max.   :1426.00   Max.   :22.000  
##  favoriteCount  commentCount         tags              level          
##  Min.   :0     Min.   :  0.000   Length:91          Length:91         
##  1st Qu.:0     1st Qu.:  0.000   Class :character   Class :character  
##  Median :0     Median :  0.000   Mode  :character   Mode  :character  
##  Mean   :0     Mean   :  6.681                                        
##  3rd Qu.:0     3rd Qu.:  2.000                                        
##  Max.   :0     Max.   :188.000                                        
##    rVersion          relevant      
##  Length:91          Mode :logical  
##  Class :character   FALSE:1        
##  Mode  :character   TRUE :90       
##                                    
##                                    
## 
# Engagement distributions compared across difficulty levels.
ggplot(rVideosUniqueRelevant, aes(level, viewCount)) +
  geom_boxplot() +
  labs(title = "Views by Difficulty")

ggplot(rVideosUniqueRelevant, aes(level, likeCount)) +
  geom_boxplot() +
  labs(title = "Likes by Difficulty")

ggplot(rVideosUniqueRelevant, aes(level, dislikeCount)) +
  geom_boxplot() +
  labs(title = "Dislikes by Difficulty")

ggplot(rVideosUniqueRelevant, aes(level, commentCount)) +
  geom_boxplot() +
  labs(title = "Comments by Difficulty")

# Aggregate per-channel video counts and total views, then label each channel
# on a count-vs-views scatter (one chart overall, one per level).
rVideosUniqueRelevantChannels <- rVideosUniqueRelevant %>%
  group_by(channelTitle) %>%
  summarise(n = n(), viewCount = sum(viewCount))

ggplot(rVideosUniqueRelevantChannels, aes(n, viewCount)) +
  geom_point() +
  geom_label_repel(aes(label = channelTitle),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  ggtitle("All Channel Videos Count by Views")

beginnerVideosUniqueRelevantChannels <- beginnerVideosUniqueRelevant %>%
  group_by(channelTitle) %>%
  summarise(n = n(), viewCount = sum(viewCount))

ggplot(beginnerVideosUniqueRelevantChannels, aes(n, viewCount)) +
  geom_point() +
  geom_label_repel(aes(label = channelTitle),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  ggtitle("Beginner Channel Video Count by Views")

intermediateVideosUniqueRelevantChannels <- intermediateVideosUniqueRelevant %>%
  group_by(channelTitle) %>%
  summarise(n = n(), viewCount = sum(viewCount))

ggplot(intermediateVideosUniqueRelevantChannels, aes(n, viewCount)) +
  geom_point() +
  geom_label_repel(aes(label = channelTitle),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  ggtitle("Intermediate Channel Video Count by Views")

advancedVideosUniqueRelevantChannels <- advancedVideosUniqueRelevant %>%
  group_by(channelTitle) %>%
  summarise(n = n(), viewCount = sum(viewCount))

ggplot(advancedVideosUniqueRelevantChannels, aes(n, viewCount)) +
  geom_point() +
  geom_label_repel(aes(label = channelTitle),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  ggtitle("Advanced Channel Video Count by Views")

# Word-frequency clouds for titles. Tokens matching filterRegex (bare
# numbers, tokens ending in 2+ digits, and dotted tokens such as URLs or
# version numbers) are dropped, as are common stopwords. Dropping the regex
# filter before the stopword anti-join is equivalent — both are row filters
# on `word` and commute.
maxWords <- 100
filterRegex <- ".*[0-9][0-9]+$|^[0-9]*$|.*\\..*"

titleWords <- rVideosUniqueRelevant[, 4] %>%
  unnest_tokens(word, title) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- beginnerVideosUniqueRelevant[, 4] %>%
  unnest_tokens(word, title) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- intermediateVideosUniqueRelevant[, 4] %>%
  unnest_tokens(word, title) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- advancedVideosUniqueRelevant[, 4] %>%
  unnest_tokens(word, title) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

# Word-frequency clouds for descriptions, using the same token filtering as
# the title clouds (regex filter and stopword anti-join commute).
descriptionWords <- rVideosUniqueRelevant[, 5] %>%
  unnest_tokens(word, description) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- beginnerVideosUniqueRelevant[, 5] %>%
  unnest_tokens(word, description) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- intermediateVideosUniqueRelevant[, 5] %>%
  unnest_tokens(word, description) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- advancedVideosUniqueRelevant[, 5] %>%
  unnest_tokens(word, description) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

# Tags were stored comma-separated in Part 1; convert the commas to spaces so
# the tokenizer splits them, then build tag-word clouds per level with the
# same filtering as above.
rVideosUniqueRelevant$tags <- gsub(",", " ", rVideosUniqueRelevant$tags)

tagsWords <- rVideosUniqueRelevant[, 13] %>%
  unnest_tokens(word, tags) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

beginnerVideosUniqueRelevant$tags <- gsub(",", " ", beginnerVideosUniqueRelevant$tags)

tagsWords <- beginnerVideosUniqueRelevant[, 13] %>%
  unnest_tokens(word, tags) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

intermediateVideosUniqueRelevant$tags <- gsub(",", " ", intermediateVideosUniqueRelevant$tags)

tagsWords <- intermediateVideosUniqueRelevant[, 13] %>%
  unnest_tokens(word, tags) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

advancedVideosUniqueRelevant$tags <- gsub(",", " ", advancedVideosUniqueRelevant$tags)

tagsWords <- advancedVideosUniqueRelevant[, 13] %>%
  unnest_tokens(word, tags) %>%
  filter(!str_detect(word, regex(filterRegex))) %>%
  anti_join(get_stopwords()) %>%
  count(word)

wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)